library(readxl)
library(readr)
library(tidyr)
library(dplyr)
library(stringr)
library(lubridate)
library(ggplot2)

# Read the three Excel workbooks. `skip = 1` makes read_excel() ignore the
# first sheet row before it reads column names and data.
PNHP_Liaa <- read_excel("PNHP_Liaa.xlsx", skip = 1)
PNHP_Kacie <- read_excel("PNHP_Kacie.xlsx", skip = 1)
P4AHCF_Kacie <- read_excel("P4AHCF_Kacie.xlsx", skip = 1)

# View() opens the data viewer, which only makes sense in an interactive
# session; guarding it lets the script also run under Rscript / source().
if (interactive()) {
  View(PNHP_Liaa)
  View(PNHP_Kacie)
  View(P4AHCF_Kacie)
}

# Read the PNHP CSV export.
pn <- read_csv("PNHP_Complete.csv")

# Inspect column names/types of the tables that get bound together below.
str(P4AHCF_Kacie)
# NOTE(review): `p4pre` and `p4post` are not created anywhere in this script;
# they must already exist in the workspace -- confirm where they are loaded.
str(p4pre)
str(p4post)

# Bring the three P4AHCF tables to matching column types so they can be
# stacked with bind_rows().

# p4pre: numeric engagement counts, logical retweet flag, UTC timestamps.
p4pre <- p4pre %>%
  mutate(
    retweets = as.numeric(retweets),
    favorites = as.numeric(favorites),
    is_retweet = is_retweet == "TRUE",
    created = as.POSIXct(created, tz = "UTC")
  )

# p4post: same conversions; timestamps are parsed with an explicit format.
p4post <- p4post %>%
  mutate(
    retweets = as.numeric(retweets),
    favorites = as.numeric(favorites),
    is_retweet = is_retweet == "TRUE",
    created = as.POSIXct(created, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
  )

# P4AHCF_Kacie: `created` is parsed with a format containing a line break
# between date and time and a literal "+00:00" suffix, then passed through
# as.POSIXct() a second time exactly as the original script did.
P4AHCF_Kacie <- P4AHCF_Kacie %>%
  mutate(
    created = as.POSIXct(
      strptime(created, format = "%Y-%m-%d\n%H:%M:%S+00:00", tz = "UTC")
    ),
    created = as.POSIXct(created, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
  )

# Stack the three tables into one P4AHCF table.
p4 <- bind_rows(P4AHCF_Kacie, p4pre, p4post)

# Inspect both tables before stacking them.
str(pn)
str(p4)

# Label each table with its account name and stack them into one data set.
combined_data <- bind_rows(
  mutate(pn, account = "PNHP"),
  mutate(p4, account = "P4AHCF")
)

# mention = 1 when the tweet text or its hashtags field contains the
# (case-sensitive) string "#MedicareForAll", 0 otherwise. grepl() returns
# FALSE for NA input, so the flag is never NA.
combined_data$mention <- as.numeric(
  grepl("#MedicareForAll", combined_data$text) |
    grepl("#MedicareForAll", combined_data$hashtags)
)

# Monthly number of tweets flagged as #MedicareForAll mentions, per account.
tweets_count <- combined_data %>%
  group_by(account, month = floor_date(created, "month")) %>%
  summarise(n = sum(mention), .groups = "drop")

# Monthly mean number of space-separated tokens per tweet, per account.
# lengths() always returns an integer vector, whereas sapply() changes its
# return type on empty input.
word_count <- combined_data %>%
  mutate(word_count = lengths(strsplit(text, " "))) %>%
  group_by(account, month = floor_date(created, "month")) %>%
  summarise(avg_word_count = mean(word_count), .groups = "drop")

# Weekly engagement totals per account. na.rm = TRUE keeps one missing count
# from turning the whole week's total into NA (same convention as the monthly
# retweet/favorite aggregation later in this script).
retweets <- combined_data %>%
  group_by(account, week = floor_date(created, "week")) %>%
  summarise(total_retweets = sum(retweets, na.rm = TRUE), .groups = "drop")

favorites <- combined_data %>%
  group_by(account, week = floor_date(created, "week")) %>%
  summarise(total_favorites = sum(favorites, na.rm = TRUE), .groups = "drop")

# Monthly count of tweets flagged as #MedicareForAll mentions, per account.
# `month` is stored as Date so it matches the Date used for the COVID marker.
monthly_mentions <- combined_data %>%
  filter(mention == 1) %>%
  mutate(month = as.Date(floor_date(created, "month"))) %>%
  group_by(account, month) %>%
  summarize(n = n(), .groups = "drop")

# Line plot of the monthly counts with a red marker at 2020-03-11.
# The marker and its label are drawn once via a fixed xintercept / annotate();
# mapping constants through aes() would redraw them for every data row.
ggplot(monthly_mentions,
       aes(x = month, y = n, color = account, linetype = account)) +
  geom_line() +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  geom_vline(xintercept = as.Date("2020-03-11"), color = "red") +
  annotate("text", x = as.Date("2020-03-11"), y = max(monthly_mentions$n),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Time", y = "Number of #MedicareForAll Tweets", title = "") +
  theme_minimal() +
  scale_x_date(limits = range(monthly_mentions$month))

# `monthly_word_counts` is not created anywhere earlier in this script, so the
# plot below could not run. Build it here: mean number of space-separated
# tokens per tweet, per account and month (month stored as Date).
monthly_word_counts <- combined_data %>%
  mutate(
    word_count = lengths(strsplit(text, " ")),
    month = as.Date(floor_date(created, "month"))
  ) %>%
  group_by(account, month) %>%
  summarise(average_word_count = mean(word_count), .groups = "drop")

# Line plot of the monthly averages with a red marker at 2020-03-11.
# annotate() draws the label once instead of once per data row.
ggplot(monthly_word_counts,
       aes(x = month, y = average_word_count,
           color = account, linetype = account)) +
  geom_line() +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  geom_vline(xintercept = as.Date("2020-03-11"), color = "red") +
  annotate("text", x = as.Date("2020-03-11"),
           y = max(monthly_word_counts$average_word_count, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Time", y = "Average Word Count", title = "") +
  theme_minimal() +
  scale_x_date(limits = range(monthly_word_counts$month))

# 1. For monthly_mentions
# Same monthly #MedicareForAll plot, without explicit x-axis limits.
# The COVID marker and label are drawn once (fixed xintercept / annotate())
# rather than once per data row.
ggplot(monthly_mentions,
       aes(x = month, y = n, color = account, linetype = account)) +
  geom_line() +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  geom_vline(xintercept = as.Date("2020-03-11"), color = "red") +
  annotate("text", x = as.Date("2020-03-11"), y = max(monthly_mentions$n),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Time", y = "Number of #MedicareForAll Tweets") +
  theme_minimal()

# Convert the week columns to Date so they share a type with the Date used
# for the COVID-19 marker.
retweets$week <- as.Date(retweets$week)
favorites$week <- as.Date(favorites$week)

# Weekly retweet totals per account.
# NOTE(review): the axis title says "(Scaled)" but the y-limit scaling below
# is commented out -- confirm which one is intended.
# The label is placed with annotate() (drawn once, not once per row) and
# max(..., na.rm = TRUE) keeps it visible when a weekly total is NA.
ggplot(retweets,
       aes(x = week, y = total_retweets, color = account, linetype = account)) +
  geom_line() +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  geom_vline(xintercept = as.Date("2020-03-11"), color = "red") +
  annotate("text", x = as.Date("2020-03-11"),
           y = max(retweets$total_retweets, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Time", y = "Total Retweets (Scaled)") +
  theme_minimal()
  # scale_y_continuous(limits = c(0, max(retweets$total_retweets) * 0.2))  # Adjust the scaling factor as needed

# Weekly favorite totals per account (same layout as the retweet plot).
ggplot(favorites,
       aes(x = week, y = total_favorites,
           color = account, linetype = account)) +
  geom_line() +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  geom_vline(xintercept = as.Date("2020-03-11"), color = "red") +
  annotate("text", x = as.Date("2020-03-11"),
           y = max(favorites$total_favorites, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Time", y = "Total Favorites (Scaled)") +
  theme_minimal()

# Mean number of space-separated tokens per tweet, per account and month.
# lengths() is type-stable (sapply() is not); `month` is stored as Date so it
# matches the Date used for the COVID-19 marker.
average_word_counts <- combined_data %>%
  mutate(
    word_count = lengths(strsplit(text, " ")),
    month = as.Date(floor_date(created, "month"))
  ) %>%
  group_by(account, month) %>%
  summarise(average_word_count = mean(word_count), .groups = "drop")

# Line plot of the monthly averages. The marker and label are drawn once via
# a fixed xintercept / annotate() instead of once per data row.
ggplot(average_word_counts,
       aes(x = month, y = average_word_count,
           color = account, linetype = account)) +
  geom_line() +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  geom_vline(xintercept = as.Date("2020-03-11"), color = "red") +
  annotate("text", x = as.Date("2020-03-11"),
           y = max(average_word_counts$average_word_count, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Time", y = "Average Word Count", title = "") +
  theme_minimal() +
  scale_x_date(limits = range(average_word_counts$month))

library(tidyverse)
library(tidyr)
library(tidytext)

head(combined_data)

# Per-timestamp Bing sentiment: split tweet text into words, keep words found
# in the Bing lexicon, count positive/negative words per (created, account),
# and score each as positive - negative.
sentiments_data <- combined_data %>%
  unnest_tokens(word, text) %>%
  inner_join(get_sentiments("bing"), by = "word") %>%  # explicit join key
  count(created, account, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(sentiment = positive - negative)

# Largest per-timestamp score (kept for reference; not used by the plot below).
max_sentiment <- max(sentiments_data$sentiment, na.rm = TRUE)

# Make sure `created` is POSIXct in UTC, matching the earlier conversions.
sentiments_data$created <- as.POSIXct(sentiments_data$created,
                                      format = "%Y-%m-%d %H:%M:%S", tz = "UTC")

# Aggregate to months: mean score per account and month.
sentiments_data_monthly <- sentiments_data %>%
  mutate(month = lubridate::floor_date(created, "month")) %>%
  group_by(month, account) %>%
  summarise(sentiment = mean(sentiment), .groups = "drop")

# Monthly mean sentiment per account.
# - The x axis is monthly, so it is labelled "Month" (it previously said "Week").
# - The marker is built in UTC so it lines up with the UTC timestamps.
# - annotate() draws the label once instead of once per data row.
# NOTE(review): this marker sits at 2020-02-01 while other plots in this script
# use 2020-03-11 or 2020-03-01 -- confirm the intended COVID-19 start date.
ggplot(sentiments_data_monthly,
       aes(x = month, y = sentiment, color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = as.POSIXct("2020-02-01", tz = "UTC"),
             linetype = "solid", color = "red", size = 0.7) +
  annotate("text", x = as.POSIXct("2020-02-01", tz = "UTC"),
           y = max(sentiments_data_monthly$sentiment),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           fontface = "bold", colour = "red") +
  labs(x = "Month", y = "Average Sentiment Score",
       color = "account", linetype = "account") +
  theme_minimal() +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed"))



# Load the required libraries if not already loaded
library(tidyverse)
library(lubridate)

# Count "scientific proxy" terms per tweet (whole-word, case-insensitive match
# on the lower-cased text) and total them per month and account.
# Assumes `created` is already POSIXct at this point.
scientific_proxies_data <- combined_data %>%
  mutate(month = floor_date(created, unit = "month")) %>%
  mutate(scientific_proxies = str_count(tolower(text), "\\b(numbers|scien\\w*|study|evidence|evidence-based|statistics|data|research)\\b")) %>%
  group_by(month, account) %>%
  # na.rm = TRUE: a tweet with missing text yields NA and would otherwise
  # turn the whole month's total into NA.
  summarise(total_proxies = sum(scientific_proxies, na.rm = TRUE),
            .groups = "drop")

# Monthly totals per account. The marker is built in UTC to line up with the
# UTC timestamps, and annotate() draws the label once instead of per row.
# NOTE(review): marker date is 2020-02-01 here but 2020-03-11 / 2020-03-01 in
# other plots of this script -- confirm the intended COVID-19 start date.
ggplot(scientific_proxies_data,
       aes(x = month, y = total_proxies, color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = as.POSIXct("2020-02-01", tz = "UTC"),
             linetype = "solid", color = "red", size = 0.7) +
  annotate("text", x = as.POSIXct("2020-02-01", tz = "UTC"),
           y = max(scientific_proxies_data$total_proxies),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           fontface = "bold", colour = "red") +
  labs(x = "Month", y = "Count of Scientific Proxies",
       color = "account", linetype = "account") +
  theme_minimal() +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed"))

str(combined_data)


# ---- Tweets mentioning "MedicareForAll" ----

# Time window used for the "period" plots below.
start_date <- as.Date("2019-03-01")
end_date <- as.Date("2021-03-01")

# Tweets from the two accounts whose hashtags or text mention Medicare for All
# (case-insensitive). This is defined first because every step below reads it;
# previously it was only created after its first use.
# NOTE(review): this section keys on `screen_name` while the rest of the script
# groups on `account` -- confirm that column exists with these exact values.
medicare_tweets <- combined_data %>%
  filter(screen_name %in% c("PNHP", "P4AHCF") &
           (grepl("(#)?MedicareForAll", hashtags, ignore.case = TRUE) |
              grepl("MedicareForAll|Medicare for All", text, ignore.case = TRUE)))

# Daily tweet counts per screen name over the full data range.
medicare_tweets_agg <- medicare_tweets %>%
  group_by(screen_name, date = as.Date(created)) %>%
  summarise(tweet_count = n(), .groups = "drop")

# The label is placed with annotate() so it is drawn once, not once per row.
ggplot(medicare_tweets_agg,
       aes(x = date, y = tweet_count,
           color = screen_name, linetype = screen_name)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(medicare_tweets_agg$tweet_count),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Date", y = "Count of Tweets", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Count of Tweets Containing 'MedicareForAll' Over Time") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  theme_minimal()

# Restrict to the window. `created` is a date-time here while the bounds are
# Dates; comparing the two classes directly makes R fall back to comparing the
# raw numbers (seconds vs. days) with an "Incompatible methods" warning, so the
# timestamps are converted to Date first. Both boundary days are included.
medicare_tweets_period <- medicare_tweets %>%
  filter(as.Date(created) >= start_date & as.Date(created) <= end_date)

# Daily tweet counts per screen name inside the window.
medicare_tweets_agg <- medicare_tweets_period %>%
  group_by(screen_name, date = as.Date(created)) %>%
  summarise(tweet_count = n(), .groups = "drop")

ggplot(medicare_tweets_agg,
       aes(x = date, y = tweet_count,
           color = screen_name, linetype = screen_name)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(medicare_tweets_agg$tweet_count),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Date", y = "Count of Tweets", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Count of Tweets Containing 'MedicareForAll' One Year Before and After COVID-19 Start") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  theme_minimal()

# Monthly tweet counts per screen name; each tweet is keyed by the first day
# of its month.
medicare_tweets_monthly <- medicare_tweets_period %>%
  group_by(screen_name, date = as.Date(format(created, "%Y-%m-01"))) %>%
  summarise(tweet_count = n(), .groups = "drop")

# Every (screen name, month) pair in the window, pre-filled with zero, so that
# months without matching tweets show up as 0 rather than being skipped.
date_seq <- seq(from = start_date, to = end_date, by = "month")
screen_names <- unique(medicare_tweets$screen_name)
all_combinations <- expand.grid(screen_name = screen_names, date = date_seq)
all_combinations$tweet_count <- 0

# Left-join the observed counts; both inputs carry `tweet_count`, so merge()
# suffixes them .x (zero default) and .y (observed). Prefer the observed value.
medicare_tweets_complete <- merge(all_combinations, medicare_tweets_monthly, by = c("screen_name", "date"), all.x = TRUE)
medicare_tweets_complete$tweet_count <- coalesce(medicare_tweets_complete$tweet_count.y, medicare_tweets_complete$tweet_count.x)

ggplot(medicare_tweets_complete,
       aes(x = date, y = tweet_count,
           color = screen_name, linetype = screen_name)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(medicare_tweets_complete$tweet_count),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Month", y = "Count of Tweets", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Monthly Count of Tweets Containing 'MedicareForAll'") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  theme_minimal()

# All tweets per screen name and month inside [start_date, end_date].
# `created` is a date-time at this point while the bounds are Dates, so it is
# converted with as.Date() before comparing; a direct comparison of the two
# classes falls back to comparing raw numbers (seconds vs. days).
total_tweets_monthly <- combined_data %>%
  filter(as.Date(created) >= start_date & as.Date(created) <= end_date) %>%
  group_by(screen_name, date = as.Date(format(created, "%Y-%m-01"))) %>%
  summarise(total_tweet_count = n(), .groups = "drop")

# Left-join the MedicareForAll counts so months with tweets but no mention are
# kept, and treat their missing count as zero.
tweets_with_proportion <- merge(total_tweets_monthly, medicare_tweets_monthly, by = c("screen_name", "date"), all.x = TRUE)
tweets_with_proportion$tweet_count <- coalesce(tweets_with_proportion$tweet_count, 0L)

# Share of MedicareForAll tweets among all tweets, in percent.
tweets_with_proportion$proportion <- with(tweets_with_proportion, tweet_count / total_tweet_count * 100)

# Lines only; the label is placed with annotate() so it is drawn once.
ggplot(tweets_with_proportion,
       aes(x = date, y = proportion,
           color = screen_name, linetype = screen_name)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(tweets_with_proportion$proportion, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Month", y = "Proportion of 'MedicareForAll' Tweets (%)", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Proportion of 'MedicareForAll' Tweets") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  theme_minimal()

# From here on `created` is a Date (time of day is dropped); every later
# floor_date(created, "month") therefore yields a Date as well.
combined_data$created <- as.Date(combined_data$created)

# Number of tweets per account and month.
total_tweets_monthly <- combined_data %>%
  mutate(month = floor_date(created, "month")) %>%
  group_by(account, month) %>%
  summarise(total_tweets = n(), .groups = "drop")

# Monthly tweet volume per account. annotate() draws the label once instead
# of once per data row.
ggplot(total_tweets_monthly,
       aes(x = month, y = total_tweets, color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(total_tweets_monthly$total_tweets),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Month", y = "Total Number of Tweets", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Total Number of Tweets for PNHP and P4AHCF") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  theme_minimal()


# Keywords on race, immigration and income.
updated_keywords <- c("race", "racial", "racism", "ethnicity", "minorities",
                      "immigrant", "immigration", "migrant", "refugee", "asylum seeker",
                      "low income", "poverty", "economic inequality", "wealth gap", "income disparity")

# Keep tweets whose lower-cased text contains any keyword.
# The filter previously referenced `expanded_keywords` and `keywords`, neither
# of which is defined in this script; `updated_keywords` is the list above.
# NOTE(review): matching is by substring, so e.g. "race" also matches inside
# longer words -- confirm whether word boundaries are wanted.
filtered_tweets <- combined_data %>%
  mutate(keyword_mention = ifelse(str_detect(tolower(text), paste(updated_keywords, collapse = "|")), 1, 0)) %>%
  filter(keyword_mention == 1)

# Number of keyword tweets per account and month.
keyword_mentions_monthly <- filtered_tweets %>%
  mutate(month = floor_date(created, "month")) %>%
  group_by(account, month) %>%
  summarise(total_mentions = sum(keyword_mention, na.rm = TRUE), .groups = "drop")

# Monthly keyword tweets per account. annotate() draws the label once instead
# of once per data row.
ggplot(keyword_mentions_monthly,
       aes(x = month, y = total_mentions, color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(keyword_mentions_monthly$total_mentions),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Month", y = "Total Mentions of Keywords", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Monthly Mentions of 'Race', 'Low Income', 'Immigrant' in Tweets") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  theme_minimal()

# Keywords on race, immigration and income.
updated_keywords <- c("race", "racial", "racism", "ethnicity", "minorities",
                      "immigrant", "immigration", "migrant", "refugee", "asylum seeker",
                      "low income", "poverty", "economic inequality", "wealth gap", "income disparity")

# Number of tweets per account and month whose lower-cased text contains any
# keyword (substring match).
keyword_tweets <- combined_data %>%
  mutate(keyword_mention = ifelse(str_detect(tolower(text), paste(updated_keywords, collapse = "|")), 1, 0)) %>%
  filter(keyword_mention == 1) %>%
  mutate(month = floor_date(created, "month")) %>%
  group_by(account, month) %>%
  summarise(keyword_tweets = n(), .groups = "drop")

# Number of all tweets per account and month.
total_tweets_monthly <- combined_data %>%
  mutate(month = floor_date(created, "month")) %>%
  group_by(account, month) %>%
  summarise(total_tweets = n(), .groups = "drop")

# Left-join so months with tweets but no keyword tweet stay in the series as
# 0 % (an inner merge silently drops them and the line skips over the gap).
proportion_data <- merge(total_tweets_monthly, keyword_tweets,
                         by = c("account", "month"), all.x = TRUE)
proportion_data$keyword_tweets[is.na(proportion_data$keyword_tweets)] <- 0L
proportion_data$proportion <- (proportion_data$keyword_tweets / proportion_data$total_tweets) * 100

# Monthly share of keyword tweets per account. annotate() draws the label once.
ggplot(proportion_data,
       aes(x = month, y = proportion, color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(proportion_data$proportion, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Month", y = "Proportion of Tweets with Keywords (%)", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Monthly Proportion of Tweets with Race, Immigrant, and Income Keywords") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  theme_minimal()


# Regex fragments used as "scientization" markers; they are OR-ed together and
# matched as substrings of the lower-cased tweet text.
scientization_keywords <- c("numbers", "scien\\w*", "study", "evidence", "evidence-based",
                            "statistics", "data", "research", "analysis", "findings",
                            "clinical", "scientific", "data-driven", "experiment", "quantitative")

# Number of tweets per account and month that contain any marker.
scientization_tweets <- combined_data %>%
  mutate(scientization_mention = ifelse(str_detect(tolower(text), paste(scientization_keywords, collapse = "|")), 1, 0)) %>%
  filter(scientization_mention == 1) %>%
  mutate(month = floor_date(created, "month")) %>%
  group_by(account, month) %>%
  summarise(scientization_tweets = n(), .groups = "drop")

# Number of all tweets per account and month.
total_tweets_monthly_sci <- combined_data %>%
  mutate(month = floor_date(created, "month")) %>%
  group_by(account, month) %>%
  summarise(total_tweets = n(), .groups = "drop")

# Monthly count of scientization tweets. annotate() draws the label once
# instead of once per data row.
ggplot(scientization_tweets,
       aes(x = month, y = scientization_tweets,
           color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(scientization_tweets$scientization_tweets, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Month", y = "Total Scientization Tweets", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Monthly Total Scientization Tweets") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  theme_minimal()

# Left-join so months with tweets but no scientization tweet stay in the
# series as 0 % (an inner merge silently drops them).
proportion_data_sci <- merge(total_tweets_monthly_sci, scientization_tweets,
                             by = c("account", "month"), all.x = TRUE)
proportion_data_sci$scientization_tweets[is.na(proportion_data_sci$scientization_tweets)] <- 0L
proportion_data_sci$proportion <- (proportion_data_sci$scientization_tweets / proportion_data_sci$total_tweets) * 100

# Monthly share of scientization tweets per account.
ggplot(proportion_data_sci,
       aes(x = month, y = proportion, color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(proportion_data_sci$proportion, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Month", y = "Proportion of Scientization Tweets (%)", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Monthly Proportion of Scientization Tweets") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  theme_minimal()

# Window for the "period" versions of the scientization plots. This overwrites
# the start_date / end_date set earlier in the script.
# NOTE(review): the plot titles say "One Year Before and After COVID-19" but
# this window runs 2019-06-01 to 2020-12-01 -- confirm which is intended.
start_date <- as.Date("2019-06-01")
end_date <- as.Date("2020-12-01")

# Scientization tweet counts restricted to the window.
scientization_tweets_period <- scientization_tweets %>%
  filter(month >= start_date & month <= end_date)

# Monthly count inside the window. annotate() draws the label once instead of
# once per data row.
ggplot(scientization_tweets_period,
       aes(x = month, y = scientization_tweets,
           color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(scientization_tweets_period$scientization_tweets, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Month", y = "Total Scientization Tweets", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Monthly Total Scientization Tweets (One Year Before and After COVID-19)") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  theme_minimal()

# All-tweet counts restricted to the same window.
total_tweets_monthly_sci_period <- total_tweets_monthly_sci %>%
  filter(month >= start_date & month <= end_date)

# Left-join so months with tweets but no scientization tweet stay in the
# series as 0 % (an inner merge silently drops them).
proportion_data_sci_period <- merge(total_tweets_monthly_sci_period,
                                    scientization_tweets_period,
                                    by = c("account", "month"), all.x = TRUE)
proportion_data_sci_period$scientization_tweets[is.na(proportion_data_sci_period$scientization_tweets)] <- 0L
proportion_data_sci_period$proportion <- (proportion_data_sci_period$scientization_tweets / proportion_data_sci_period$total_tweets) * 100

# Monthly share inside the window.
ggplot(proportion_data_sci_period,
       aes(x = month, y = proportion, color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(proportion_data_sci_period$proportion, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Month", y = "Proportion of Scientization Tweets (%)", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Monthly Proportion of Scientization Tweets (One Year Before and After COVID-19)") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  theme_minimal()

# Word count per tweet = number of runs of non-whitespace characters.
combined_data$word_count <- str_count(combined_data$text, "\\S+")

# Mean word count per account and month (tweets with missing text are ignored).
average_word_count_monthly <- combined_data %>%
  mutate(month = floor_date(created, "month")) %>%
  group_by(account, month) %>%
  summarise(average_word_count = mean(word_count, na.rm = TRUE), .groups = "drop")

# Monthly average word count per account. annotate() draws the label once
# instead of once per data row.
ggplot(average_word_count_monthly,
       aes(x = month, y = average_word_count,
           color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(average_word_count_monthly$average_word_count, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  labs(x = "Month", y = "Average Word Count", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Monthly Average Word Count for PNHP and P4AHCF") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  theme_minimal()

library(tidytext)

# One row per word of every tweet (all other columns are carried along).
unnested_data <- combined_data %>%
  unnest_tokens(word, text)

# Bing: count positive/negative lexicon words per account and month, then
# score as positive - negative. The join key is stated explicitly, as in the
# AFINN and NRC joins below; pivot_wider() replaces the superseded spread().
bing_sentiments <- unnested_data %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(account, month = floor_date(created, "month"), sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(bing_sentiment = positive - negative)

# AFINN: sum of the numeric word values per account and month.
afinn_sentiments <- unnested_data %>%
  inner_join(get_sentiments("afinn") %>% mutate(value = as.numeric(value)), by = "word") %>%
  group_by(account, month = floor_date(created, "month")) %>%
  summarise(afinn_sentiment = sum(value), .groups = "drop")

# NRC: only the positive/negative categories are used; scored like Bing.
nrc_sentiments <- unnested_data %>%
  inner_join(get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative")), by = "word") %>%
  count(account, month = floor_date(created, "month"), sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(nrc_sentiment = positive - negative)

# One plot per lexicon: monthly sentiment score per account with a red marker
# at 2020-03-01. The label is placed with annotate() so it is drawn once
# instead of once per data row.

# Bing
ggplot(bing_sentiments,
       aes(x = month, y = bing_sentiment, color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(bing_sentiments$bing_sentiment, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  labs(x = "Month", y = "Bing Sentiment Score", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Bing Sentiment Analysis Over Time") +
  theme_minimal()

# AFINN
ggplot(afinn_sentiments,
       aes(x = month, y = afinn_sentiment, color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(afinn_sentiments$afinn_sentiment, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  labs(x = "Month", y = "AFINN Sentiment Score", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("AFINN Sentiment Analysis Over Time") +
  theme_minimal()

# NRC
ggplot(nrc_sentiments,
       aes(x = month, y = nrc_sentiment, color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(nrc_sentiments$nrc_sentiment, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  labs(x = "Month", y = "NRC Sentiment Score", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("NRC Sentiment Analysis Over Time") +
  theme_minimal()


# Combine the three lexicon scores per (account, month) and average them row
# by row, ignoring lexicons that have no score for that month.
# ungroup() removes the rowwise grouping again so later verbs on this table
# operate on whole columns.
# NOTE(review): the three scores are on different scales (word-count
# differences vs. summed AFINN values); confirm a plain mean is intended.
aggregated_sentiments <- full_join(bing_sentiments, afinn_sentiments, by = c("account", "month")) %>%
  full_join(nrc_sentiments, by = c("account", "month")) %>%
  rowwise() %>%
  mutate(average_sentiment = mean(c(bing_sentiment, afinn_sentiment, nrc_sentiment), na.rm = TRUE)) %>%
  ungroup()

# Monthly averaged sentiment per account. annotate() draws the label once
# instead of once per data row.
ggplot(aggregated_sentiments,
       aes(x = month, y = average_sentiment,
           color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = as.Date("2020-03-01"), color = "red", linetype = "solid") +
  annotate("text", x = as.Date("2020-03-01"),
           y = max(aggregated_sentiments$average_sentiment, na.rm = TRUE),
           label = "COVID-19 start", hjust = -0.1, vjust = 1.5,
           colour = "red") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  labs(x = "Month", y = "Average Sentiment Score", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Aggregated Sentiment Analysis Over Time") +
  theme_minimal()

# Monthly engagement per account: total retweets, total favorites and the
# number of tweets, followed by the per-tweet averages ("normalized" values).
retweets_favorites_monthly <- combined_data %>%
  group_by(account, month = floor_date(created, "month")) %>%
  summarise(
    total_retweets = sum(retweets, na.rm = TRUE),
    total_favorites = sum(favorites, na.rm = TRUE),
    total_tweets = n(),
    .groups = "drop"
  ) %>%
  mutate(
    # Divide the monthly totals by the number of tweets in that month.
    normalized_retweets = total_retweets / total_tweets,
    normalized_favorites = total_favorites / total_tweets
  )

# Line charts of per-tweet retweets and favorites over time, one per metric.
# The COVID-19 marker must have the same class as the `month` column:
# ggplot2's date and datetime scales fail when Date and POSIXct are mixed.
covid_marker <- as.Date("2020-03-01")
if (inherits(retweets_favorites_monthly$month, "POSIXct")) {
  covid_marker <- as.POSIXct("2020-03-01", tz = "UTC")
}

ggplot(retweets_favorites_monthly, aes(x = month, y = normalized_retweets, color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = covid_marker, color = "red", linetype = "solid") +
  # annotate() draws the label exactly once; geom_text(aes(...)) would draw it
  # once per data row, stacking identical labels on top of each other.
  annotate("text",
           x = covid_marker,
           y = max(retweets_favorites_monthly$normalized_retweets, na.rm = TRUE),
           label = "COVID-19 start",
           hjust = -0.1, vjust = 1.5, color = "red") +
  # Both accounts are drawn in black and told apart by line type only.
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  labs(x = "Month", y = "Normalized Retweets", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Monthly Normalized Retweets for PNHP and P4AHCF") +
  theme_minimal()

ggplot(retweets_favorites_monthly, aes(x = month, y = normalized_favorites, color = account, linetype = account)) +
  geom_line() +
  geom_vline(xintercept = covid_marker, color = "red", linetype = "solid") +
  annotate("text",
           x = covid_marker,
           y = max(retweets_favorites_monthly$normalized_favorites, na.rm = TRUE),
           label = "COVID-19 start",
           hjust = -0.1, vjust = 1.5, color = "red") +
  scale_color_manual(values = c("PNHP" = "black", "P4AHCF" = "black")) +
  scale_linetype_manual(values = c("PNHP" = "solid", "P4AHCF" = "dashed")) +
  labs(x = "Month", y = "Normalized Favorites", color = "Twitter Account", linetype = "Twitter Account") +
  ggtitle("Monthly Normalized Favorites for PNHP and P4AHCF") +
  theme_minimal()


# Define the topics and associated keywords
topics <- list(
  Race = c("race", "racism", "racial discrimination", "racial disparities",
           "racial inequality", "ethnic", "ethnic group", "ethnicity",
           "minority"),
  LowIncome = c("poverty", "socioeconomic", "economic inequality",
                "income gap", "financial hardship", "disadvantaged",
                "impoverished", "underprivileged"),
  Gender = c("gender equality", "gender discrimination", "gender bias",
             "gender pay gap", "gender identity", "sexism",
             "gender stereotypes", "LGBTQ+"),
  Immigration = c("immigrants", "immigration policy", "undocumented",
                  "refugee", "asylum seekers", "border control", "migration",
                  "citizenship"),
  Age = c("elderly", "senior citizens", "aging population", "ageism",
          "generational", "youth", "baby boomers", "millennials")
)

# Build the alternation regex for one keyword group.
# Keywords are lower-cased because they are matched against lower-cased tweet
# text (an upper-case keyword such as "LGBTQ+" could otherwise never match),
# and punctuation is escaped so that e.g. "+" is a literal character rather
# than a regex quantifier.
topic_pattern <- function(keywords) {
  escaped <- gsub("([[:punct:]])", "\\\\\\1", tolower(keywords))
  paste(escaped, collapse = "|")
}

# Adding topic-specific columns with counts for each keyword group
# NOTE(review): keywords match as substrings (e.g. "race" also matches inside
# longer words); add word boundaries if whole-word counts are wanted.
combined_data <- combined_data %>%
  mutate(Topic_Race = str_count(tolower(text), topic_pattern(topics$Race)),
         Topic_LowIncome = str_count(tolower(text), topic_pattern(topics$LowIncome)),
         Topic_Gender = str_count(tolower(text), topic_pattern(topics$Gender)),
         Topic_Immigration = str_count(tolower(text), topic_pattern(topics$Immigration)),
         Topic_Age = str_count(tolower(text), topic_pattern(topics$Age)))

# Aggregate mention counts by account
# na.rm = TRUE: a tweet with missing text has NA counts, and a single NA would
# otherwise turn the whole per-account total for that topic into NA.
mentions_count <- combined_data %>%
  group_by(account) %>%
  summarise(Race = sum(Topic_Race, na.rm = TRUE),
            LowIncome = sum(Topic_LowIncome, na.rm = TRUE),
            Gender = sum(Topic_Gender, na.rm = TRUE),
            Immigration = sum(Topic_Immigration, na.rm = TRUE),
            Age = sum(Topic_Age, na.rm = TRUE))

# Reshape data for plotting: one row per account/topic pair, with the topic
# name in `Topic` and its total in `Count`.
mentions_count_long <- mentions_count %>%
  pivot_longer(cols = c(Race, LowIncome, Gender, Immigration, Age),
               names_to = "Topic",
               values_to = "Count")

# Grouped bar chart of topic mention totals, one bar per account within each
# topic. PNHP bars are filled black and P4AHCF bars white, all outlined in
# black.
ggplot(mentions_count_long, aes(x = Topic, y = Count, fill = account)) +
  geom_col(position = position_dodge(width = 0.7), color = "black") +
  scale_fill_manual(
    name = "Twitter Account",
    values = c("PNHP" = "black", "P4AHCF" = "white"),
    labels = c("PNHP" = "PNHP", "P4AHCF" = "P4AHCF")
  ) +
  ggtitle("Mentions of Various Topics by Twitter Account") +
  xlab("Topic") +
  ylab("Mention Count") +
  theme_minimal() +
  # Legend tweaks are applied after theme_minimal() so they are not reset.
  theme(
    legend.key = element_blank(),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )


# Cut-off date used to split tweets into a pre- and a post-COVID-19 period
covid_start_date <- as.Date("2020-03-01")

# Topic -> keyword lookup (repeats the list defined earlier in this script so
# that this section can be run on its own)
topics <- list(
  Race = c(
    "race", "racism", "racial discrimination", "racial disparities",
    "racial inequality", "ethnic", "ethnic group", "ethnicity", "minority"
  ),
  LowIncome = c(
    "poverty", "socioeconomic", "economic inequality", "income gap",
    "financial hardship", "disadvantaged", "impoverished", "underprivileged"
  ),
  Gender = c(
    "gender equality", "gender discrimination", "gender bias",
    "gender pay gap", "gender identity", "sexism", "gender stereotypes",
    "LGBTQ+"
  ),
  Immigration = c(
    "immigrants", "immigration policy", "undocumented", "refugee",
    "asylum seekers", "border control", "migration", "citizenship"
  ),
  Age = c(
    "elderly", "senior citizens", "aging population", "ageism",
    "generational", "youth", "baby boomers", "millennials"
  )
)

# Split the data into before and after COVID-19
# `created` is converted to a date-time (POSIXct) earlier in this script, while
# `covid_start_date` is a Date. Comparing the two directly makes R compare the
# raw numbers (seconds vs. days since 1970), which would put every tweet in
# the "after" group. Compare calendar dates (in UTC) instead.
before_covid <- combined_data %>%
  filter(as.Date(created, tz = "UTC") < covid_start_date)

after_covid <- combined_data %>%
  filter(as.Date(created, tz = "UTC") >= covid_start_date)

#' Count topic keyword mentions per account, in long format
#'
#' For every topic, counts how often any of its keywords occurs in the
#' lower-cased `text` of each row, sums those counts per `account`, and
#' reshapes the result to one row per account/topic pair.
#'
#' @param data A data frame with at least the columns `account` and `text`.
#' @param topic_keywords A named list of character vectors, one keyword
#'   vector per topic. Defaults to the script-level `topics` list.
#' @return A tibble with columns `account`, `Topic` (the names of
#'   `topic_keywords`) and `Count` (total keyword matches).
process_data <- function(data, topic_keywords = topics) {
  lowered_text <- tolower(data$text)

  # One count vector per topic. Keywords are lower-cased because they are
  # matched against lower-cased text (an upper-case keyword such as "LGBTQ+"
  # could otherwise never match), and punctuation is escaped so that "+" is a
  # literal character rather than a regex quantifier.
  topic_counts <- lapply(topic_keywords, function(keywords) {
    escaped <- gsub("([[:punct:]])", "\\\\\\1", tolower(keywords))
    str_count(lowered_text, paste(escaped, collapse = "|"))
  })

  data.frame(account = data$account, topic_counts,
             check.names = FALSE, stringsAsFactors = FALSE) %>%
    group_by(account) %>%
    # na.rm = TRUE: rows with missing text yield NA counts, which would
    # otherwise turn the whole per-account total into NA.
    summarise(across(everything(), ~ sum(.x, na.rm = TRUE)),
              .groups = "drop") %>%
    pivot_longer(cols = all_of(names(topic_keywords)),
                 names_to = "Topic",
                 values_to = "Count")
}

# Run the topic counting on each period separately: tweets created before
# the COVID-19 cut-off and tweets created on or after it.
mentions_count_long_before <- before_covid %>% process_data()
mentions_count_long_after <- after_covid %>% process_data()

# Grouped bar chart of topic mentions per account for the period before the
# COVID-19 cut-off (PNHP filled black, P4AHCF filled white).
ggplot(mentions_count_long_before, aes(x = Topic, y = Count, fill = account)) +
  geom_col(position = position_dodge(width = 0.7), color = "black") +
  scale_fill_manual(
    name = "Twitter Account",
    values = c("PNHP" = "black", "P4AHCF" = "white")
  ) +
  ggtitle("Mentions of Various Topics by Twitter Account (Before COVID-19)") +
  xlab("Topic") +
  ylab("Mention Count") +
  theme_minimal()

# Grouped bar chart of topic mentions per account for the period from the
# COVID-19 cut-off onwards (PNHP filled black, P4AHCF filled white).
ggplot(mentions_count_long_after, aes(x = Topic, y = Count, fill = account)) +
  geom_col(position = position_dodge(width = 0.7), color = "black") +
  scale_fill_manual(
    name = "Twitter Account",
    values = c("PNHP" = "black", "P4AHCF" = "white")
  ) +
  ggtitle("Mentions of Various Topics by Twitter Account (After COVID-19)") +
  xlab("Topic") +
  ylab("Mention Count") +
  theme_minimal()


